DATA IMPORTATION AND CLEANING

library(classdata)
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.1.3
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.6     v purrr   0.3.4
## v tibble  3.1.6     v dplyr   1.0.9
## v tidyr   1.2.0     v stringr 1.4.0
## v readr   2.1.2     v forcats 0.5.1
## Warning: package 'tibble' was built under R version 4.1.3
## Warning: package 'tidyr' was built under R version 4.1.3
## Warning: package 'readr' was built under R version 4.1.3
## Warning: package 'dplyr' was built under R version 4.1.3
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(ggplot2)
library(plyr)
## Warning: package 'plyr' was built under R version 4.1.3
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following object is masked from 'package:purrr':
## 
##     compact
library(matrixStats)
## Warning: package 'matrixStats' was built under R version 4.1.3
## 
## Attaching package: 'matrixStats'
## The following object is masked from 'package:plyr':
## 
##     count
## The following object is masked from 'package:dplyr':
## 
##     count
library(plotly)
## Warning: package 'plotly' was built under R version 4.1.3
## 
## Attaching package: 'plotly'
## The following objects are masked from 'package:plyr':
## 
##     arrange, mutate, rename, summarise
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
#Columns used from the data tables.
Columns = c('X', 'Canada', 'Saudi.Arabia', 'United.States', 'India', 'Russian.Federation', 'South.Africa', 'Turkey', 'Brazil', 'Mexico', 'France', 'Germany', 'Italy', 'United.Kingdom', 'China', 'Sweden', 'Japan', 'Korea..Rep.', 'Switzerland', 'Spain', 'Italy', 'Belgium')

#Countries in the G20 + some to get the total to 20. Some of the G20 countries had no data.
Countries = c('Canada', 'Saudi.Arabia', 'United.States', 'India', 'Russian.Federation', 'South.Africa', 'Turkey', 'Brazil', 'Mexico', 'France', 'Germany', 'Italy', 'United.Kingdom', 'China', 'Sweden', 'Japan', 'Korea..Rep.', 'Switzerland', 'Spain', 'Italy', 'Belgium')

CPI <- read.csv("CPI.csv") #Used in 2, 3
CPIG20 <- CPI %>% select(one_of(Columns))
CPIG20 <- CPIG20[-1, ] #remove null first row
CPIG20 <- CPIG20[-1, ] #remove 1990
CPIG20 <- head(CPIG20, - 1) #remove last row (empty on most tables)
rownames(CPIG20) <- NULL #Reset row values

FReserves <- read.csv("ForeignReserves.csv") #Used in 6
FReservesG20 <- FReserves %>% select(one_of(Columns))
FReservesG20 <- FReservesG20[-1, ] #remove null first row
FReservesG20 <- head(FReservesG20, - 1) #remove last row 
rownames(FReservesG20) <- NULL #Reset row values

GDP <- read.csv("GDP.csv") #Used in 1
GDPG20 <- GDP %>% select(one_of(Columns))
GDPG20 <- GDPG20[-1, ] #remove null first row
GDPG20 <- GDPG20[-1, ] #remove 1990
GDPG20 <- head(GDPG20, - 1) #remove last row (empty on most tables)
rownames(GDPG20) <- NULL #Reset row values

StockMarket <- read.csv("StockMarket.csv") #Used in 4
StockMarketG20 <- StockMarket %>% select(one_of(Columns))
StockMarketG20 <- StockMarketG20[-1, ] #remove null first row
StockMarketG20 <- StockMarketG20[-1, ] #remove 1990
StockMarketG20 <- head(StockMarketG20, - 1) #remove last row (empty on most tables)
rownames(StockMarketG20) <- NULL #Reset row values

TotalReserves <- read.csv("TotalReserves.csv") #Used in 6
TotalReservesG20 <- TotalReserves %>% select(one_of(Columns))
TotalReservesG20 <- TotalReservesG20[-1, ] #remove null first row
TotalReservesG20 <- TotalReservesG20[-1, ] #remove 1990
TotalReservesG20 <- head(TotalReservesG20, - 1) #remove last row (empty on most tables)
rownames(TotalReservesG20) <- NULL #Reset row values

UnemploymentRate <- read.csv("UnemploymentRate.csv") #Used in 5
UnemploymentRateG20 <- UnemploymentRate %>% select(one_of(Columns))
UnemploymentRateG20 <- UnemploymentRateG20[-1, ] #remove null first row
UnemploymentRateG20 <- UnemploymentRateG20[-1, ] #remove 1990
UnemploymentRateG20 <- head(UnemploymentRateG20, - 1) #remove last row (empty on most tables)
rownames(UnemploymentRateG20) <- NULL #Reset row values
  1. Line Plot: Which 5 countries have the highest GDP in 2018? How are the plots related over time?

Expectations: We expect mostly western countries to have the highest GDP. This is because western countries prioritize freedom and capitalism. We expect this support to show up in the data, and we expect the plots to be mostly related.

Post Code: The 5 counties with the highest GDP in 2018 are: The United States, China, Japan, Germany, and UK in that order. We can see that the plots between the western countries are highly coordinated. This makes sense as these countries have both military and trade alliances. China is a notable outlier in this data. This country shows an exceptional growth rate starting in about 2005. China is an outlier in another way too, but this time joined by Japan. The US, Germany and the UK all suffered badly from 2008 and you can see extreme GDP loss and growth loss. Both Germany and the UK have yet to meaningfully break above it’s 2008 max. Overall our expectations were met.

summary(GDPG20)
##        X            Canada         Saudi.Arabia    United.States     
##  Min.   :1991   Min.   : 579059   Min.   :328375   Min.   : 6158100  
##  1st Qu.:1998   1st Qu.: 649782   1st Qu.:451739   1st Qu.: 8941500  
##  Median :2004   Median :1101535   Median :649039   Median :12625200  
##  Mean   :2004   Mean   :1142943   Mean   :591157   Mean   :12630757  
##  3rd Qu.:2011   3rd Qu.:1576904   3rd Qu.:724157   3rd Qu.:15706300  
##  Max.   :2018   Max.   :1846649   Max.   :782193   Max.   :20494200  
##                                   NA's   :14                         
##      India         Russian.Federation  South.Africa        Turkey      
##  Min.   : 290308   Min.   : 115811    Min.   :116612   Min.   :187473  
##  1st Qu.: 489428   1st Qu.: 340399    1st Qu.:138155   1st Qu.:251367  
##  Median :1141455   Median : 934606    Median :243800   Median :450405  
##  Mean   :1228519   Mean   :1023202    Mean   :239696   Mean   :515492  
##  3rd Qu.:1840370   3rd Qu.:1613752    3rd Qu.:326310   3rd Qu.:790821  
##  Max.   :2718578   Max.   :2280211    Max.   :417641   Max.   :948785  
##  NA's   :5         NA's   :2                                           
##      Brazil            Mexico            France           Germany       
##  Min.   : 511777   Min.   : 352035   Min.   :1360939   Min.   :1954674  
##  1st Qu.: 669195   1st Qu.: 527009   1st Qu.:1495213   1st Qu.:2222652  
##  Median :1106836   Median : 829427   Median :2155390   Median :2836442  
##  Mean   :1363320   Mean   : 840095   Mean   :2089589   Mean   :2893531  
##  3rd Qu.:1877122   3rd Qu.:1127903   3rd Qu.:2669880   3rd Qu.:3503618  
##  Max.   :2617293   Max.   :1313676   Max.   :2926868   Max.   :4006795  
##  NA's   :3                                                              
##      Italy         United.Kingdom        China              Sweden      
##  Min.   :1004008   Min.   :1151618   Min.   :  488522   Min.   :210153  
##  1st Qu.:1244337   1st Qu.:1609003   1st Qu.: 1060124   1st Qu.:266917  
##  Median :1815878   Median :2405983   Median : 2282510   Median :385760  
##  Mean   :1667115   Mean   :2153345   Mean   : 4522453   Mean   :390382  
##  3rd Qu.:2087325   3rd Qu.:2690112   3rd Qu.: 8027645   3rd Qu.:513633  
##  Max.   :2399198   Max.   :3085993   Max.   :13601165   Max.   :580017  
##                                      NA's   :1                          
##      Japan          Korea..Rep.       Switzerland         Spain        
##  Min.   :3590619   Min.   : 324177   Min.   :261865   Min.   : 452311  
##  1st Qu.:4444592   1st Qu.: 552106   1st Qu.:292585   1st Qu.: 615478  
##  Median :4827492   Median : 831865   Median :401480   Median :1114377  
##  Mean   :4793066   Mean   : 871664   Mean   :455377   Mean   :1006271  
##  3rd Qu.:4993991   3rd Qu.:1208254   3rd Qu.:668661   3rd Qu.:1366484  
##  Max.   :6202415   Max.   :1620151   Max.   :710774   Max.   :1640875  
##                                                                        
##     Belgium      
##  Min.   :219723  
##  1st Qu.:259059  
##  Median :379502  
##  Mean   :374437  
##  3rd Qu.:488713  
##  Max.   :532337  
## 
#Countries with the highest GDP in 2018 are United States, China, Japan, Germany, UK 

#Plot
plot <- data.frame(data = GDPG20,
                 x = GDPG20$X,
                 y = c(GDPG20$United.States,
                       GDPG20$China,
                       GDPG20$Japan,
                       GDPG20$Germany,
                       GDPG20$United.Kingdom),
                 group = rep(c("United States", "China", "Japan", "Germany", "United Kingdom"), each = 28))
  
ggplot(plot, aes(x, y, col = group)) +
  geom_line() + 
  labs(title = "Country GDP per Year", y = "GDP", x = "Year")
## Warning: Removed 1 row(s) containing missing values (geom_path).

  1. Barplot: Find the cumulative CPI change from 1991 to 2018 for all countries. Then plot in a barchart the cumulative change for each country. How does it look? How does each country compare to the healthy 28 year cumulative inflation rate (~56%)?

Expectations: As with the last qeustion we expect mostly western countries to have healthy data. We think there may be some blow outs with inflation, but we expect ~200% to be the maximum.

Post Code: From the chart we can see there are three countries which have an extreme amount of total cumulative inflation. Brazil, Russia, and Turkey all have over 1000% of cumulative total inflation. This is extreme and is deleterious to the economy of these nations. However, we can also see the flip side to that unhealthy coin when we view Japan. After 28 years Japan only has a cumulative inflation value of 10.7. This is low and unhealthy. This means on average Japan only had .38% inflation per year. Mexico, South Africa, China, India, Korea, Spain, Sweden, France, and Switzerland all had somewhat healthy rates. While the rest of the nations had healthy cumulative inflation rates. Our expections were somewhat met. We had not anticipated over 1000% inflation for some countries. On top of this most western countries were healthy.

test = which(unlist(lapply(CPIG20, is.numeric)))
CPIG20[(nrow(CPIG20) + 1), test] <- colSums(CPIG20[, test], na.rm=TRUE) 
#add a row that is cumulative inflation

#Create a table of just Country and Total Inflation
TotalCPI = CPIG20 %>%
  rowid_to_column() %>%
  filter(rowid == 29) %>%
  gather(col, val, Canada:Belgium) %>%
  select(-rowid, -X)
names(TotalCPI) <- c("Country", "Cum.Infaltion")

#Plot
MaxInflationPerCountryFigure <- ggplot(TotalCPI, aes(reorder(Country, -Cum.Infaltion), Cum.Infaltion)) +
  scale_y_continuous(trans='log10') +
  geom_col() +
  theme(axis.text.x = element_text(angle = 60, hjust = 1)) +
  labs(title = "Cumulative Inflation (1991 - 2018) per Country", y = "Cumulative Inflation", x = "Country")

MaxInflationPerCountryFigure <- ggplotly(MaxInflationPerCountryFigure)
MaxInflationPerCountryFigure
  1. Histogram: For each year in the CPI data find which county had the highest change. Then plot each year, country, and max value in a histogram. How does the data look?

Expectations: Based on the last question I think it is safe to assume Brazil, Russia, and Turkey will be the leaders for most years.

Post Code: We can see that in the first 3 years of the data there was a lot of inflation for Brazil and Russia. These values are so high they would almost be considered an outlier here. Instead, we can look at the year and history to see these countries had lots of internal issues. Especially in Russia as the USSR would have dissolved in 1991. From this we can conclude that stability of a country is necessary to have stable inflation. Our expectation were met everywhere expect 1 spot. We did not expect India to show up as a yearly inflation leader.

test = CPIG20 %>% replace(is.na(.), 0) #make all na values 0 in the table
YearlyMaxData <- data.frame(matrix(ncol = 3, nrow = 0))
#create new empty table

#for each year (1991 - 2018) find the country with the highest inflation. Then append that data to YearlyMaxData
for (x in 1:28) {
  #Find data
  InterYearMax = test %>%
  rowid_to_column() %>%
  filter(rowid == x) %>%
  gather(col, val, Canada:Belgium) %>%
  filter(val == max(val)) %>%
  select(-rowid)
  
  #Append data
  YearlyMaxData = rbind(YearlyMaxData, InterYearMax)
}

#Plot
MaxInflationPerYearFigure <- ggplot(data=YearlyMaxData, aes(x=X, y=val, fill=col)) +
  scale_y_continuous(trans='log10') +
  geom_bar(stat="identity") + 
  labs(title = "Highest Country Inflation (1991 - 2018) per Year", y = "Inflation", x = "Year")

MaxInflationPerYearFigure <- ggplotly(MaxInflationPerYearFigure)
MaxInflationPerYearFigure
  1. Time Series: Find the three countries that had the smallest stock market cap in 1991 and the three countries with the biggest stock market cap in 2018. Ignore na values. Then plot the countries on a Time Series Plot. Does any part of the chart surprise you?

Expectations: We expect mostly western countries to have the highest stock market cap. And we expect to be able to see the dot com bubble and the 2008 housing bubble.

Post Code: There are 2 notables facts about the data. The first is that Japan’s stock market cap starts out so high compared to the rest. It starts about 5 times bigger than any other country. And on top of this Japan does not keep the lead. By 1998 the US and Japan are about equal. By 2001 the United States was ahead. The other interesting part is that you can clearly see the 2008 financial crisis happen. Each country screamed higher on the years leading up and the years after it went down. Other than that the information is not surprising. Our expectations were fully met here.

summary(StockMarketG20)
##        X            Canada        Saudi.Arabia    United.States   
##  Min.   :1991   Min.   : 25.24   Min.   : 20.37   Min.   : 27.98  
##  1st Qu.:1998   1st Qu.: 40.87   1st Qu.: 34.16   1st Qu.: 79.55  
##  Median :2004   Median : 67.04   Median :100.43   Median :100.33  
##  Mean   :2004   Mean   : 70.95   Mean   : 89.05   Mean   :103.91  
##  3rd Qu.:2011   3rd Qu.:106.79   3rd Qu.:125.11   3rd Qu.:124.32  
##  Max.   :2018   Max.   :118.62   Max.   :204.09   Max.   :239.19  
##                                  NA's   :3                        
##      India        Russian.Federation  South.Africa        Turkey      
##  Min.   : 17.10   Min.   :  7.585    Min.   : 26.18   Min.   : 16.11  
##  1st Qu.: 25.56   1st Qu.: 29.620    1st Qu.: 76.32   1st Qu.: 24.68  
##  Median : 39.49   Median : 54.059    Median :105.48   Median : 59.29  
##  Mean   : 60.82   Mean   : 63.005    Mean   : 90.30   Mean   : 56.34  
##  3rd Qu.:101.03   3rd Qu.: 94.884    3rd Qu.:115.25   3rd Qu.: 80.24  
##  Max.   :137.83   Max.   :152.166    Max.   :122.21   Max.   :111.24  
##                   NA's   :4          NA's   :11                       
##      Brazil          Mexico           France          Germany      
##  Min.   :10.79   Min.   : 13.82   Min.   : 35.82   Min.   : 39.66  
##  1st Qu.:15.87   1st Qu.: 23.23   1st Qu.: 57.35   1st Qu.: 63.42  
##  Median :39.60   Median : 44.97   Median : 82.18   Median : 88.57  
##  Mean   :42.90   Mean   : 62.65   Mean   : 80.54   Mean   : 95.88  
##  3rd Qu.:65.04   3rd Qu.:104.65   3rd Qu.:100.48   3rd Qu.:127.25  
##  Max.   :99.28   Max.   :131.81   Max.   :140.24   Max.   :173.16  
##  NA's   :3                                         NA's   :5       
##      Italy        United.Kingdom       China            Sweden      
##  Min.   : 58.50   Min.   : 46.88   Min.   : 31.62   Min.   : 19.97  
##  1st Qu.: 77.66   1st Qu.: 73.95   1st Qu.: 77.17   1st Qu.: 47.32  
##  Median : 84.82   Median :100.33   Median : 89.56   Median : 79.53  
##  Mean   : 96.02   Mean   : 94.20   Mean   : 86.46   Mean   : 84.43  
##  3rd Qu.:115.26   3rd Qu.:114.87   3rd Qu.: 96.87   3rd Qu.:123.92  
##  Max.   :170.17   Max.   :148.01   Max.   :126.97   Max.   :156.86  
##  NA's   :6                         NA's   :14                       
##      Japan         Korea..Rep.      Switzerland         Spain       
##  Min.   : 68.89   Min.   : 73.55   Min.   : 18.05   Min.   : 17.58  
##  1st Qu.: 97.47   1st Qu.: 99.40   1st Qu.: 56.77   1st Qu.: 38.34  
##  Median :122.86   Median :117.00   Median : 79.33   Median : 56.28  
##  Mean   :120.58   Mean   :142.40   Mean   : 84.09   Mean   : 58.08  
##  3rd Qu.:139.02   3rd Qu.:147.42   3rd Qu.:110.16   3rd Qu.: 76.08  
##  Max.   :172.82   Max.   :306.33   Max.   :148.02   Max.   :128.38  
##                   NA's   :5                                         
##     Belgium      
##  Min.   : 38.80  
##  1st Qu.: 62.89  
##  Median : 91.24  
##  Mean   : 88.88  
##  3rd Qu.:110.69  
##  Max.   :168.11  
## 
#3 Countries with the smallest cap at the start are: India, Mexico, Spain
#3 Countries with the largest cap at the end are: US, Germany, Japan

#Plot
plot <- data.frame(data = StockMarketG20,
                 x = StockMarketG20$X,
                 y = c(StockMarketG20$India,
                       StockMarketG20$Mexico,
                       StockMarketG20$Spain,
                       StockMarketG20$United.States,
                       StockMarketG20$Germany,
                       StockMarketG20$Japan),
                 group = rep(c("India", "Mexico", "Spain", "United States", "Germany", "Japan"), each = 28))
  
ggplot(plot, aes(x, y, col = group)) +
  geom_line() + 
  labs(title = "Country Stock Market Cap per Year", y = "Stock Market Cap", x = "Year")
## Warning: Removed 5 row(s) containing missing values (geom_path).

  1. Box Plot: Make a box plot for each nations unemployment rate. Note 3 things about the data.

Expectations: We do not know what to expect with this data. Our only guess is that for some the rate may stay high.

Post Code: First we can see that South Africa has a really high unemployment rate and it seems to be consistently high. The variation for the data from south aftica is extremely low. On the other hand we can see countries with extremely high variation. Spain has a huge range in it’s data. The range from Q1 to Q3 of Spain is bigger than the total range of any other country. Finally we can see that a “healthy” range of unemployment seems to be around ~6%. We can come to this conclusion by looking at countries that are healthy and wealthy from the other questions. These countries include: Germany, UK, US. Going in we had almost no expectations and the one we slightly had did manifest with South Africa. Spain having a wide spread is notable.

#Plot
UnemploymentRateG20 %>% select(Countries) %>%
  pivot_longer(., cols = Countries, names_to = "Var", values_to = "Val") %>%
  ggplot(aes(x = Var, y = Val)) +
  theme(axis.text.x = element_text(angle = 60, hjust = 1)) +
  geom_boxplot() + 
  labs(title = "Boxplot of unemployment % per country", y = "Unemployment %", x = "Country")
## Note: Using an external vector in selections is ambiguous.
## i Use `all_of(Countries)` instead of `Countries` to silence this message.
## i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
## Warning: Removed 120 rows containing non-finite values (stat_boxplot).

  1. Scatter Plot: Investigate the relationship between Total Reserves and Foreign reserves of Countries.

Expectations: We expect a straight line from origin to the top right. There are o other expectations, we believe this one will be cut and dry.

Post Code: This is the most interesting plot of all the plots. We expected this to be a linear trend line and there to be no variability. However, what we got was happily surprising. We can see that countries generally take three approaches to reserves. Either the country holds a lot Total Reserves and not a lot of Foreign Reserves or vice verse. The Final approach is to hold neither (bottom left corner). Notable points of data: UK blows out the competition in Foreign Reserves and China blows out the competition in Total Reserves. This makes sense as the UK would want to hold a lot of USD and China would hold a lot of yuan as they have a closed economic system and trade a lot with the US. Our expectations were blown away. This plot is fun to observe.

ZerodTotal = TotalReservesG20 %>% replace(is.na(.), 0) #Set all na values to 0
ZerodForeign = FReservesG20 %>% replace(is.na(.), 0) #Set all na values to 0

YearlyReserveData <- data.frame(matrix(ncol = 4, nrow = 0)) #Create a new table with 4 rows that is empty.

#for each year (1991 - 2018) get data from each country for Total Reserves in a place holder table. Then find add a column from foreign reserves data to place holder table. Then append the final place holder table data to YearlyReserveData.
for (x in 1:28) {
  #Get data from each country for Total Reserves
  InterYearTotal = ZerodTotal %>%
  rowid_to_column() %>%
  filter(rowid == x) %>%
  gather(col, val, Canada:Belgium) %>%
  select(-rowid)
  
  names(InterYearTotal) <- c("Year", "Country", "TotalReserves") #Rename
  
  #Get data from each country for Foreign Reserves
  InterYearForiegn = ZerodForeign %>%
  rowid_to_column() %>%
  filter(rowid == x) %>%
  gather(col, val, Canada:Belgium) %>%
  select(-rowid, -X, -col)
  
  InterYearTotal$ForiegnReserves = InterYearForiegn$val #Add column from Foreign reserves to total reserves data
  
  YearlyReserveData = rbind(YearlyReserveData, InterYearTotal) #Append final place holder table data to YearlyReserveData
}

#plot
YearlyReserveData %>% ggplot(aes(x = TotalReserves, y = ForiegnReserves, color = Country)) +
  geom_point() + 
  labs(title = "Scatter plot of Total vs Foreign Reserves per Country", y = "ForiegnReserves", x = "TotalReserves")